# Areal Spatial Data Analysis: Spatial Autocorrelation and Cluster Detection
We will first load a standard data frame consisting of several county-level variables:
# Packages: here (project-relative paths), sf/maps (spatial data), stringr
# (string utilities), colorRamps/ggplot2 (plotting), spdep (spatial
# dependence), DCluster (disease-cluster detection).
library(here)
library(sf)
library(maps)
library(stringr)
library(colorRamps)
library(ggplot2)
library(spdep)
library(DCluster)
# Loads the county-level data frame `dat` (FIPS, Area, State, Population,
# Cases, Income, inc) into the workspace.
load ( here("data", "Data.RData") )
summary (dat)
## FIPS Area State Population
## Length:226 Baldwin County : 2 AL: 67 Min. : 1884
## Class :character Bibb County : 2 GA:159 1st Qu.: 13321
## Mode :character Calhoun County : 2 Median : 24790
## Cherokee County: 2 Mean : 62711
## Clarke County : 2 3rd Qu.: 61806
## Clay County : 2 Max. :992137
## (Other) :214
## Cases Income inc
## Min. : 6.0 Min. :16646 Min. : 5.00
## 1st Qu.: 48.0 1st Qu.:27477 1st Qu.: 25.00
## Median : 91.0 Median :31186 Median : 41.00
## Mean : 299.3 Mean :33255 Mean : 45.88
## 3rd Qu.: 223.8 3rd Qu.:36738 3rd Qu.: 61.00
## Max. :6139.0 Max. :71227 Max. :131.00
##
Next, we will read in county spatial polygons for the contiguous US in the maps package. R can also read in any shapefile via the sf package. The st_as_sf function converts the map object to an sf object, which encodes the ID, polygons, and any unit-specific data value.
# Read Alabama + Georgia county polygons from the maps package. fill = TRUE is
# required so the result can be converted to polygons; plot = FALSE suppresses
# the side-effect plot. (TRUE/FALSE spelled out; T/F are reassignable.)
county.map = map ("county", region = c("alabama", "georgia"), fill = TRUE, plot = FALSE)
county.map = sf::st_as_sf(county.map) #Convert from map object to sf
plot (st_geometry(county.map)) #only plot the polygons
Our sf object contains several attributes:
# Structure of the sf object: an ID column ("state,county") plus an
# sfc_MULTIPOLYGON geometry list-column.
str (county.map)
## Classes 'sf' and 'data.frame': 226 obs. of 2 variables:
## $ ID : chr "alabama,autauga" "alabama,baldwin" "alabama,barbour" "alabama,bibb" ...
## $ geom:sfc_MULTIPOLYGON of length 226; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:51, 1:2] -86.5 -86.5 -86.5 -86.6 -86.6 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geom"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA
## ..- attr(*, "names")= chr "ID"
# An sf object is a data.frame carrying extra attributes: the active geometry
# column name (sf_column) and attribute-geometry relationships (agr).
attributes (county.map)
## $names
## [1] "ID" "geom"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226
##
## $sf_column
## [1] "geom"
##
## $agr
## ID
## <NA>
## Levels: constant aggregate identity
##
## $class
## [1] "sf" "data.frame"
Let’s look at some metadata information:
# Print the collection header (geometry type, bounding box, CRS) and the
# first n = 2 features.
print (county.map, n = 2)
## Simple feature collection with 226 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.47614 ymin: 30.24071 xmax: -80.84435 ymax: 35.01345
## Geodetic CRS: WGS 84
## First 2 features:
## ID geom
## 1 alabama,autauga MULTIPOLYGON (((-86.50517 3...
## 2 alabama,baldwin MULTIPOLYGON (((-87.93757 3...
There are several ways we can extract information from an sf object.
#Extract the first polygon (an sfg object) from the geometry list-column
poly1 = st_geometry (county.map)[[1]]
#Print all the points used to draw the polygon
poly1
plot (poly1)
Our first data task is to merge the dataset with the sf object. In our case, the counties are ordered by row exactly the same for the two datasets and we can use cbind. In general, it’s good to merge by polygon ID.
#Option 1: rows are in identical county order in both objects, so bind columns
dat.areal = cbind (county.map, dat)
dat.areal[1:3,]
## Simple feature collection with 3 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.01778 ymin: 30.24071 xmax: -85.06131 ymax: 32.71016
## Geodetic CRS: WGS 84
## ID FIPS Area State Population Cases Income inc
## 1 alabama,autauga 01001 Autauga County AL 49960 184 42013 37
## 2 alabama,baldwin 01003 Baldwin County AL 171769 416 40250 24
## 3 alabama,barbour 01005 Barbour County AL 27941 165 25101 59
## geom
## 1 MULTIPOLYGON (((-86.50517 3...
## 2 MULTIPOLYGON (((-87.93757 3...
## 3 MULTIPOLYGON (((-85.42801 3...
#Option 2: merge by FIPS code (safer in general than relying on row order)
data (county.fips) #Load county 5-digit FIPS code
# Look up each polygon's FIPS by matching its "state,county" name
county.map$FIPS = county.fips$fips[ match (county.map$ID, county.fips$polyname)]
# county.fips stores FIPS as a number; restore the leading zero (1001 -> "01001")
county.map$FIPS = str_pad (county.map$FIPS, 5, "left", "0")
# Sanity check: every polygon FIPS has a row in the data
all (county.map$FIPS %in% dat$FIPS)
## [1] TRUE
dat.areal2 = merge (county.map, dat, by = "FIPS")
The object dat.areal now contains the polygon information together with the dataframe we would like to analyze. We can do standard operations on the dataframe now. We can also create a choropleth map!
# 226 counties, 9 columns (8 attribute fields + geometry)
dim (dat.areal)
## [1] 226 9
names(dat.areal)
## [1] "ID" "FIPS" "Area" "State" "Population"
## [6] "Cases" "Income" "inc" "geom"
# Base-plot choropleths: single-bracket subsetting keeps the geometry column
plot (dat.areal["Income"], main = "Median Household Income in 2000")
plot (dat.areal["inc"], main = "Chlamydia Incidence (per 10,000) in 2007")
The package spdep contains a suite of functions for working with areal spatial data. The poly2nb () function identifies neighbours using spatial polygons. Neighbours are defined as sharing a common boundary point. The default in poly2nb uses the queen’s case definition, that is, two polygons are neighbours even if they share only a single boundary point. This can be suppressed by the queen = FALSE option.
#Because our shapefile here is un-projected, distance operations can be difficult. Alternatively, we can project the data to, for example, the Lambert Conformal Conic North America.
dat.areal_proj = st_transform(dat.areal, crs = "ESRI:102004")
plot (st_geometry(dat.areal_proj))
# Queen contiguity (default): polygons sharing at least one boundary point
nb = poly2nb (dat.areal_proj)
# Rook contiguity (queen = FALSE): a shared boundary segment is required
nb2 = poly2nb (dat.areal_proj, queen=FALSE)
#A lot of good summaries about the neighbor structure
summary (nb)
## Neighbour list object:
## Number of regions: 226
## Number of nonzero links: 1258
## Percentage nonzero weights: 2.462996
## Average number of links: 5.566372
## Link number distribution:
##
## 2 3 4 5 6 7 8 9 10
## 3 12 37 50 70 41 8 3 2
## 3 least connected regions:
## 90 92 186 with 2 links
## 2 most connected regions:
## 120 127 with 10 links
The function nb2mat () then creates a proximity/weight matrix using the neighbourhood information. Here we have several options to define weights:
# Binary ("B") weight matrix: w_ij = 1 if i and j are neighbours, else 0
W = nb2mat (nb, style = "B")
center_proj = st_centroid(dat.areal_proj) ##Extract centroids of polygons
center = st_transform(center_proj, crs = 4326) #Re-project back to WGS 84
center = st_coordinates (center)
# Overlay links: rook links drawn red over queen links, so queen-only links stay blue
plot (st_geometry(dat.areal))
plot (nb, center, add = TRUE, col = "blue")
plot (nb2, center, add = TRUE, col = "red", lwd = 1)
title (main="Blue lines = additional neighbours by queen's case")
## Using kth-nearest distance (on projected centroids)
nb.k1 = knn2nb (knearneigh(center_proj,k=1))
nb.k2 = knn2nb (knearneigh(center_proj,k=2), row.names=row.names(center))
plot (st_geometry(dat.areal))
plot (nb.k2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.k1, center, add = TRUE, col = "blue", lwd = 1)
title (main="Blue lines = additional second nearest-neighbour")
## Using buffer distance; d2 is in projected units, 25*1000 matching the 25 km legend
nb.buffer1 = dnearneigh(center_proj, d1=0, d2 = 25*1000)
nb.buffer2 = dnearneigh(center_proj, d1=0, d2 = 40*1000)
plot (st_geometry(dat.areal))
plot (nb.buffer2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.buffer1, center, add = TRUE, col = "blue", lwd =2)
legend ("topright", legend =c("<25 km", "< 40 km"), col= c("blue", "red"), pch = 16)
Let’s first produce a Moran Plot. We first show how to do it by hand. The moran.plot () function in spdep will also identify high influence points.
Y = dat.areal$Income
nb = poly2nb (dat.areal_proj)
# Row-standardized ("W") weights, so W %*% Y averages each county's neighbours
W = nb2mat (nb, style = "W")
WY = W%*%Y
# Moran plot: spatially lagged income vs. income (label typo "Wegithed" fixed)
plot (WY~Y, col = 4, ylab = "Spatially Lagged Weighted Income", xlab = "Income", cex.lab = 1.4, cex.axis = 1.2, cex = 1.2)
abline (h = mean (WY), lty = 2); abline(v=mean(Y), lty=2)
abline(0,1)
#Here we need to use the weighted adjacency matrix (default in *nb2listw*)
moran.plot (Y, nb2listw(nb))
Calculate Moran’s I by hand and perform asymptotic hypothesis test.
# Moran's I by hand: I = (n / S0) * sum_ij w_ij r_i r_j / sum_i r_i^2,
# with r = Y - mean(Y) and S0 = sum(W) the total weight
ybar = mean (Y)
r = Y - ybar
I = sum(r%*% t(r)*W)/ sum (r^2)*nrow(W)/sum(W)
I
## [1] 0.5409861
#Note that the Moran's statistic is the same as the slope from the previous regression model
coef (lm (WY~Y))
## (Intercept) Y
## 1.517555e+04 5.409861e-01
#The default performs non-Gaussian test under randomization
col.W <- nb2listw(nb, style="W")
moran.test (Y, col.W)
##
## Moran I test under randomisation
##
## data: Y
## weights: col.W
##
## Moran I statistic standard deviate = 13.634, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.540986055 -0.004444444 0.001600329
#Assume Y is Gaussian
moran.test (Y, col.W, randomisation = FALSE)
##
## Moran I test under normality
##
## data: Y
## weights: col.W
##
## Moran I statistic standard deviate = 13.547, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.540986055 -0.004444444 0.001620976
Moran’s I hypothesis test by permutation
# Monte-Carlo (permutation) test for Moran's I with 10,000 simulations
I.perm = moran.mc (Y, col.W, 10000)
I.perm
##
## Monte-Carlo simulation of Moran I
##
## data: Y
## weights: col.W
## number of simulations + 1: 10001
##
## statistic = 0.54099, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
hist (I.perm$res, main = "Moran's I under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. (Was hard-coded at 0.567, which does not match
# the observed Moran's I of 0.54099 reported above.)
abline (v = I.perm$statistic, col = 2, lwd = 4)
text (0.5, 20000, "Observed \n Moran's I", col = 2)
Geary’s C
##Geary's C: asymptotic test under randomisation
geary.test (Y, col.W)
##
## Geary C test under randomisation
##
## data: Y
## weights: col.W
##
## Geary C statistic standard deviate = 12.987, p-value < 2.2e-16
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic Expectation Variance
## 0.425651698 1.000000000 0.001955928
# Permutation test with 10,000 simulations
C.perm = geary.mc (Y, col.W, 10000)
hist (C.perm$res, main = "Geary's C under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. (Was hard-coded at 0.4295; the observed Geary's C
# reported above is 0.42565.)
abline (v = C.perm$statistic, col = 2, lwd = 4)
text (0.5, 15000, "Observed \n Geary's C", col = 2)
# 3.4 Local Moran’s I
To estimate local Moran’s I for the chlamydia incidence rate, we need a first-order adjacency matrix W. The localmoran () function gives:
Y = dat.areal$inc
# Binary weights for the local Moran's I
col.W <- nb2listw(nb, style="B")
I.local = localmoran (Y, col.W)
# Columns: Ii (local statistic), E(Ii), Var(Ii), z-score, two-sided p-value
I.local[1:4,]
## Ii E.Ii Var.Ii Z.Ii Pr(z != E(Ii))
## 1 -2.340030 -0.002422875 0.5375311 -3.1883775 0.001430736
## 2 -1.103451 -0.011766813 2.6159660 -0.6749646 0.499698269
## 3 2.445618 -0.007403101 1.6265637 1.9233832 0.054431941
## 4 -4.071777 -0.011786781 2.5992363 -2.5182691 0.011793319
Next, we extract the local Moran’s I results and include them in our spatial dataframe.
dat.areal$I.local = I.local[,1]
# Raw p-values and multiplicity adjustments (Bonferroni, Holm, FDR)
I.local.p = I.local[,5]
I.local.p_bonf = p.adjust (I.local[,5], method ="bonferroni")
I.local.p_holm = p.adjust (I.local[,5], method ="holm")
I.local.p_fdr = p.adjust (I.local[,5], method ="fdr")
# Long-format frame: one row per county x adjustment method
plot.dat = data.frame (FIPS = dat.areal$FIPS,
type =rep(c("Raw", "Bonferroni", "Holm", "FDR"), each = length (I.local.p)),
p = c(I.local.p, I.local.p_bonf, I.local.p_holm, I.local.p_fdr))
# Bin p-values into three display categories for mapping
plot.dat$p=as.factor(cut(plot.dat$p, c(0,0.01, 0.05, 1), right=TRUE ))
levels (plot.dat$p) = c("<0.01", "0.01-0.05", ">0.05")
plot.dat$type = factor(plot.dat$type, levels = c("Raw", "Bonferroni", "Holm", "FDR"))
# Merge back onto the sf object so each panel can be drawn as a map
plot.dat = merge (dat.areal, plot.dat, by = "FIPS")
ggplot () + geom_sf (data = plot.dat, aes (fill = p)) + facet_wrap(~type)+
scale_fill_discrete(name = "p-value")
To avoid relying on asymptotic normality, we can also perform a permutation test. Here we re-sample the incidence data with replacement 10,000 times. Each time, we calculate the local Moran’s I statistics. First, note that under H_0 of no spatial dependence, the local Moran’s I statistics exhibit large skewness compared to a normal distribution.
#Number of resampling iterations
n.iter = 10000
#Rows = 226 counties, columns = iterations; preallocated (never grown in the loop)
I.keep = matrix (NA, ncol = n.iter, nrow = length (Y))
Y = dat.areal$inc
for (i in seq_len(n.iter)) {
  # Progress report every 1000 iterations
  if (i %% 1000 == 0) { print(i) }
  # Under H0, resample Y with replacement and recompute all local Moran's I
  I.keep[, i] <- localmoran(sample(Y, length(Y), replace = TRUE), col.W)[, 1]
}
## [1] 1000
## [1] 2000
## [1] 3000
## [1] 4000
## [1] 5000
## [1] 6000
## [1] 7000
## [1] 8000
## [1] 9000
## [1] 10000
#Check the normality for local Moran's I for county 1 and county 20
par (mfrow = c(1,2))
qqnorm(I.keep[1,], main = "Moran's I (Null Distribution)\n County ID = 1");qqline(I.keep[1,])
qqnorm(I.keep[20,], main = "Moran's I (Null Distribution)\n County ID = 20");qqline(I.keep[20,])
I.obs = localmoran (Y, col.W)[,1]
#Calculate P(Local Moran's I > observed | Null): one-sided resampling p-value per county
P_perm = apply ( sweep (I.keep, 1, I.obs, ">" ), 1, mean)
# Smallest resolvable p-value is 1/n.iter; avoid reporting exactly zero
P_perm[P_perm==0] = 1/n.iter
dat.areal$MoranI_perm_p= cut(p.adjust(P_perm, "bonferroni"), c(0,0.01, 0.05, 1) )
levels (dat.areal$MoranI_perm_p) = c("<0.01", "0.01-0.05", ">0.05")
ggplot () + geom_sf (data = dat.areal, aes (fill = MoranI_perm_p)) +
ggtitle("Permutation-based Bonferroni-corrected") + scale_fill_discrete(name = "p-value")
We will now focus on performing cluster detection specifically for case incidence data. We will use functions in the R package DCluster. These functions often require the input as a dataset with two variable names: Observed and Expected. First, we perform tests to see if spatial clustering exists.
# DCluster expects a data frame with columns named Observed and Expected.
dismap = data.frame (Observed = dat.areal$Cases, Pop = dat.areal$Population)
# Overall incidence rate: total cases / total population (constant-risk null)
theta_0 = sum(dismap$Observed)/sum(dismap$Pop)
# Expected counts under constant risk
dismap$Expected = dismap$Pop * theta_0
################
# Chi2 test #
################
#Asymptotic test
achisq.stat (dismap)
## $T
## [1] 21755.29
##
## $df
## [1] 225
##
## $pvalue
## [1] 0
#Simulation-based (parametric bootstrap with R = 1000 replicates)
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.000999001
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Multinomial
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.000999001
# Negative-binomial null allows extra-Poisson variation; note the much larger p-value
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "negbin", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Negative Binomial
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.3976024
##############################
# Potthoff-Whittinghill test #
##############################
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Potthoff-Whittinghill's test of overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 6032784494
## p-value : 0.000999001
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Potthoff-Whittinghill's test of overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Multinomial
## Number of simulations: 1000
## Statistic: 6032784494
## p-value : 0.000999001
###############
# Moran's I #
###############
# Rebuild neighbour list and (row-standardized, the nb2listw default) weights
nb = poly2nb (dat.areal_proj)
col.W = nb2listw(nb)
moranI.test (Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000, listw = col.W, n = nrow(dismap), S0 = Szero(col.W))
## Moran's I test of spatial autocorrelation
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 0.3525243
## p-value : 0.000999001
We will first load a standard data frame consisting of several county-level variables:
library(here)
library(sf)
library(maps)
library(stringr)
library(colorRamps)
library(ggplot2)
library(spdep)
library(DCluster)
load ( here("data", "Data.RData") )
summary (dat)
## FIPS Area State Population
## Length:226 Baldwin County : 2 AL: 67 Min. : 1884
## Class :character Bibb County : 2 GA:159 1st Qu.: 13321
## Mode :character Calhoun County : 2 Median : 24790
## Cherokee County: 2 Mean : 62711
## Clarke County : 2 3rd Qu.: 61806
## Clay County : 2 Max. :992137
## (Other) :214
## Cases Income inc
## Min. : 6.0 Min. :16646 Min. : 5.00
## 1st Qu.: 48.0 1st Qu.:27477 1st Qu.: 25.00
## Median : 91.0 Median :31186 Median : 41.00
## Mean : 299.3 Mean :33255 Mean : 45.88
## 3rd Qu.: 223.8 3rd Qu.:36738 3rd Qu.: 61.00
## Max. :6139.0 Max. :71227 Max. :131.00
##
Next, we will read in county spatial polygons for the contiguous US in the maps package. R can also read in any shapefile via the sf package. The st_as_sf function converts the map object to an sf object, which encodes the ID, polygons, and any unit-specific data value.
# Read Alabama + Georgia county polygons from the maps package. fill = TRUE is
# required so the result can be converted to polygons; plot = FALSE suppresses
# the side-effect plot. (TRUE/FALSE spelled out; T/F are reassignable.)
county.map = map ("county", region = c("alabama", "georgia"), fill = TRUE, plot = FALSE)
county.map = sf::st_as_sf(county.map) #Convert from map object to sf
plot (st_geometry(county.map)) #only plot the polygons
Our sf object contains several attributes:
str (county.map)
## Classes 'sf' and 'data.frame': 226 obs. of 2 variables:
## $ ID : chr "alabama,autauga" "alabama,baldwin" "alabama,barbour" "alabama,bibb" ...
## $ geom:sfc_MULTIPOLYGON of length 226; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:51, 1:2] -86.5 -86.5 -86.5 -86.6 -86.6 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geom"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA
## ..- attr(*, "names")= chr "ID"
attributes (county.map)
## $names
## [1] "ID" "geom"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## [55] 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## [73] 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## [91] 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
## [109] 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
## [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
## [145] 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
## [163] 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## [181] 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
## [199] 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
## [217] 217 218 219 220 221 222 223 224 225 226
##
## $sf_column
## [1] "geom"
##
## $agr
## ID
## <NA>
## Levels: constant aggregate identity
##
## $class
## [1] "sf" "data.frame"
Let’s look at some metadata information:
print (county.map, n = 2)
## Simple feature collection with 226 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.47614 ymin: 30.24071 xmax: -80.84435 ymax: 35.01345
## Geodetic CRS: WGS 84
## First 2 features:
## ID geom
## 1 alabama,autauga MULTIPOLYGON (((-86.50517 3...
## 2 alabama,baldwin MULTIPOLYGON (((-87.93757 3...
There are several ways we can extract information from an sf object.
#Extract the first polygon and make a plot
poly1 = st_geometry (county.map)[[1]]
#Print all the points used to draw the polygon
poly1
plot (poly1)
Our first data task is to merge the dataset with the sf object. In our case, the counties are ordered by row exactly the same for the two datasets and we can use cbind. In general, it’s good to merge by polygon ID.
#Option 1
dat.areal = cbind (county.map, dat)
dat.areal[1:3,]
## Simple feature collection with 3 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -88.01778 ymin: 30.24071 xmax: -85.06131 ymax: 32.71016
## Geodetic CRS: WGS 84
## ID FIPS Area State Population Cases Income inc
## 1 alabama,autauga 01001 Autauga County AL 49960 184 42013 37
## 2 alabama,baldwin 01003 Baldwin County AL 171769 416 40250 24
## 3 alabama,barbour 01005 Barbour County AL 27941 165 25101 59
## geom
## 1 MULTIPOLYGON (((-86.50517 3...
## 2 MULTIPOLYGON (((-87.93757 3...
## 3 MULTIPOLYGON (((-85.42801 3...
#Option 2
data (county.fips) #Load county 5-digit FIPS code
county.map$FIPS = county.fips$fips[ match (county.map$ID, county.fips$polyname)]
county.map$FIPS = str_pad (county.map$FIPS, 5, "left", "0")
all (county.map$FIPS %in% dat$FIPS)
## [1] TRUE
dat.areal2 = merge (county.map, dat, by = "FIPS")
The object dat.areal now contains the polygon information together with the dataframe we would like to analyze. We can do standard operations on the dataframe now. We can also create a choropleth map!
dim (dat.areal)
## [1] 226 9
names(dat.areal)
## [1] "ID" "FIPS" "Area" "State" "Population"
## [6] "Cases" "Income" "inc" "geom"
plot (dat.areal["Income"], main = "Median Household Income in 2000")
plot (dat.areal["inc"], main = "Chlamydia Incidence (per 10,000) in 2007")
Or use ggplot.
ggplot() + geom_sf (data = dat.areal, aes (fill = Income))+
labs(title="Median Household Income in 2000") +scale_fill_gradient2(low = "white",high = "red", limits = c(0, 80000))
ggplot() + geom_sf (data = dat.areal, aes (fill = inc))+
labs(title="Chlamydia Incidence (per 10,000)") +scale_fill_gradient2(low = "white",high = "blue", limits = c(0, 140))
The package spdep contains a suite of functions for working with areal spatial data. The poly2nb () function identifies neighbours using spatial polygons. Neighbours are defined as sharing a common boundary point. The default in poly2nb uses the queen’s case definition, that is, two polygons are neighbours even if they share only a single boundary point. This can be suppressed by the queen = FALSE option.
#Because our shapefile here is un-projected, distance operations can be difficult. Alternatively, we can project the data to, for example, the Lambert Conformal Conic North America.
dat.areal_proj = st_transform(dat.areal, crs = "ESRI:102004")
plot (st_geometry(dat.areal_proj))
nb = poly2nb (dat.areal_proj)
nb2 = poly2nb (dat.areal_proj, queen=FALSE)
#A lot of good summaries about the neighbor structure
summary (nb)
## Neighbour list object:
## Number of regions: 226
## Number of nonzero links: 1258
## Percentage nonzero weights: 2.462996
## Average number of links: 5.566372
## Link number distribution:
##
## 2 3 4 5 6 7 8 9 10
## 3 12 37 50 70 41 8 3 2
## 3 least connected regions:
## 90 92 186 with 2 links
## 2 most connected regions:
## 120 127 with 10 links
The function nb2mat () then creates a proximity/weight matrix using the neighbourhood information. Here we have several options to define weights:
W = nb2mat (nb, style = "B")
center_proj = st_centroid(dat.areal_proj) ##Extract centroids of polygons
center = st_transform(center_proj, crs = 4326) #Re-project back to WGS 84
center = st_coordinates (center)
plot (st_geometry(dat.areal))
plot (nb, center, add = TRUE, col = "blue")
plot (nb2, center, add = TRUE, col = "red", lwd = 1)
title (main="Blue lines = additional neighbours by queen's case")
## Using kth-nearest distance
nb.k1 = knn2nb (knearneigh(center_proj,k=1))
nb.k2 = knn2nb (knearneigh(center_proj,k=2), row.names=row.names(center))
plot (st_geometry(dat.areal))
plot (nb.k2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.k1, center, add = TRUE, col = "blue", lwd = 1)
title (main="Blue lines = additional second nearest-neighbour")
## Using buffer distance
nb.buffer1 = dnearneigh(center_proj, d1=0, d2 = 25*1000)
nb.buffer2 = dnearneigh(center_proj, d1=0, d2 = 40*1000)
plot (st_geometry(dat.areal))
plot (nb.buffer2, center, add = TRUE, col = "red", lwd = 1)
plot (nb.buffer1, center, add = TRUE, col = "blue", lwd =2)
legend ("topright", legend =c("<25 km", "< 40 km"), col= c("blue", "red"), pch = 16)
Let’s first produce a Moran Plot. We first show how to do it by hand. The moran.plot () function in spdep will also identify high influence points.
Y = dat.areal$Income
nb = poly2nb (dat.areal_proj)
# Row-standardized ("W") weights, so W %*% Y averages each county's neighbours
W = nb2mat (nb, style = "W")
WY = W%*%Y
# Moran plot: spatially lagged income vs. income (label typo "Wegithed" fixed)
plot (WY~Y, col = 4, ylab = "Spatially Lagged Weighted Income", xlab = "Income", cex.lab = 1.4, cex.axis = 1.2, cex = 1.2)
abline (h = mean (WY), lty = 2); abline(v=mean(Y), lty=2)
abline(0,1)
#Here we need to use the weighted adjacency matrix (default in *nb2listw*)
moran.plot (Y, nb2listw(nb))
Calculate Moran’s I by hand and perform asymptotic hypothesis test.
ybar = mean (Y)
r = Y - ybar
I = sum(r%*% t(r)*W)/ sum (r^2)*nrow(W)/sum(W)
I
## [1] 0.5409861
#Note that the Moran's statistic is the same as the slope from the previous regression model
coef (lm (WY~Y))
## (Intercept) Y
## 1.517555e+04 5.409861e-01
#The default performs non-Gaussian test under randomization
col.W <- nb2listw(nb, style="W")
moran.test (Y, col.W)
##
## Moran I test under randomisation
##
## data: Y
## weights: col.W
##
## Moran I statistic standard deviate = 13.634, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.540986055 -0.004444444 0.001600329
#Assume Y is Gaussian
moran.test (Y, col.W, randomisation = FALSE)
##
## Moran I test under normality
##
## data: Y
## weights: col.W
##
## Moran I statistic standard deviate = 13.547, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.540986055 -0.004444444 0.001620976
Moran’s I hypothesis test by permutation
# Monte-Carlo (permutation) test for Moran's I with 10,000 simulations
I.perm = moran.mc (Y, col.W, 10000)
I.perm
##
## Monte-Carlo simulation of Moran I
##
## data: Y
## weights: col.W
## number of simulations + 1: 10001
##
## statistic = 0.54099, observed rank = 10001, p-value = 9.999e-05
## alternative hypothesis: greater
hist (I.perm$res, main = "Moran's I under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. (Was hard-coded at 0.567, which does not match
# the observed Moran's I of 0.54099 reported above.)
abline (v = I.perm$statistic, col = 2, lwd = 4)
text (0.5, 20000, "Observed \n Moran's I", col = 2)
Geary’s C
##Geary's C: asymptotic test under randomisation
geary.test (Y, col.W)
##
## Geary C test under randomisation
##
## data: Y
## weights: col.W
##
## Geary C statistic standard deviate = 12.987, p-value < 2.2e-16
## alternative hypothesis: Expectation greater than statistic
## sample estimates:
## Geary C statistic Expectation Variance
## 0.425651698 1.000000000 0.001955928
# Permutation test with 10,000 simulations
C.perm = geary.mc (Y, col.W, 10000)
hist (C.perm$res, main = "Geary's C under Null Hypothesis", xlab = "", ylab ="")
# Mark the observed statistic. (Was hard-coded at 0.4295; the observed Geary's C
# reported above is 0.42565.)
abline (v = C.perm$statistic, col = 2, lwd = 4)
text (0.5, 15000, "Observed \n Geary's C", col = 2)
# 3.4 Local Moran’s I
To estimate local Moran’s I for the chlamydia incidence rate, we need a first-order adjacency matrix W. The localmoran () function gives:
Y = dat.areal$inc
col.W <- nb2listw(nb, style="B")
I.local = localmoran (Y, col.W)
I.local[1:4,]
## Ii E.Ii Var.Ii Z.Ii Pr(z != E(Ii))
## 1 -2.340030 -0.002422875 0.5375311 -3.1883775 0.001430736
## 2 -1.103451 -0.011766813 2.6159660 -0.6749646 0.499698269
## 3 2.445618 -0.007403101 1.6265637 1.9233832 0.054431941
## 4 -4.071777 -0.011786781 2.5992363 -2.5182691 0.011793319
Next, we extract the local Moran’s I results and include them in our spatial dataframe.
dat.areal$I.local = I.local[,1]
I.local.p = I.local[,5]
I.local.p_bonf = p.adjust (I.local[,5], method ="bonferroni")
I.local.p_holm = p.adjust (I.local[,5], method ="holm")
I.local.p_fdr = p.adjust (I.local[,5], method ="fdr")
plot.dat = data.frame (FIPS = dat.areal$FIPS,
type =rep(c("Raw", "Bonferroni", "Holm", "FDR"), each = length (I.local.p)),
p = c(I.local.p, I.local.p_bonf, I.local.p_holm, I.local.p_fdr))
plot.dat$p=as.factor(cut(plot.dat$p, c(0,0.01, 0.05, 1), right=TRUE ))
levels (plot.dat$p) = c("<0.01", "0.01-0.05", ">0.05")
plot.dat$type = factor(plot.dat$type, levels = c("Raw", "Bonferroni", "Holm", "FDR"))
plot.dat = merge (dat.areal, plot.dat, by = "FIPS")
ggplot () + geom_sf (data = plot.dat, aes (fill = p)) + facet_wrap(~type)+
scale_fill_discrete(name = "p-value")
To avoid relying on asymptotic normality, we can also perform a permutation test. Here we re-sample the incidence data with replacement 10,000 times. Each time, we calculate the local Moran’s I statistics. First, note that under H_0 of no spatial dependence, the local Moran’s I statistics exhibit large skewness compared to a normal distribution.
#Number of resampling iterations
n.iter = 10000
#Rows = 226 counties, columns = iterations; preallocated (never grown in the loop)
I.keep = matrix (NA, ncol = n.iter, nrow = length (Y))
Y = dat.areal$inc
for (i in seq_len(n.iter)) {
  # Progress report every 1000 iterations
  if (i %% 1000 == 0) { print(i) }
  # Under H0, resample Y with replacement and recompute all local Moran's I
  I.keep[, i] <- localmoran(sample(Y, length(Y), replace = TRUE), col.W)[, 1]
}
## [1] 1000
## [1] 2000
## [1] 3000
## [1] 4000
## [1] 5000
## [1] 6000
## [1] 7000
## [1] 8000
## [1] 9000
## [1] 10000
#Check the normality for local Moran's I for county 1 and county 20
par (mfrow = c(1,2))
qqnorm(I.keep[1,], main = "Moran's I (Null Distribution)\n County ID = 1");qqline(I.keep[1,])
qqnorm(I.keep[20,], main = "Moran's I (Null Distribution)\n County ID = 20");qqline(I.keep[20,])
I.obs = localmoran (Y, col.W)[,1]
#Calculate P(Local Moran's I > observed | Null): one-sided resampling p-value per county
P_perm = apply ( sweep (I.keep, 1, I.obs, ">" ), 1, mean)
# Smallest resolvable p-value is 1/n.iter; avoid reporting exactly zero
P_perm[P_perm==0] = 1/n.iter
dat.areal$MoranI_perm_p= cut(p.adjust(P_perm, "bonferroni"), c(0,0.01, 0.05, 1) )
levels (dat.areal$MoranI_perm_p) = c("<0.01", "0.01-0.05", ">0.05")
ggplot () + geom_sf (data = dat.areal, aes (fill = MoranI_perm_p)) +
ggtitle("Permutation-based Bonferroni-corrected") + scale_fill_discrete(name = "p-value")
We will now focus on performing cluster detection specifically for case incidence data. We will use functions in the R package DCluster. These functions often require the input as a dataset with two variable names: Observed and Expected. First, we perform tests to see if spatial clustering exists.
dismap = data.frame (Observed = dat.areal$Cases, Pop = dat.areal$Population)
theta_0 = sum(dismap$Observed)/sum(dismap$Pop)
dismap$Expected = dismap$Pop * theta_0
################
# Chi2 test #
################
#Asymptotic test
achisq.stat (dismap)
## $T
## [1] 21755.29
##
## $df
## [1] 225
##
## $pvalue
## [1] 0
#Simulation-based
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.000999001
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Multinomial
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.000999001
achisq.test(Observed~offset(log(Expected)), data = dismap, model = "negbin", R=1000)
## Chi-square test for overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Negative Binomial
## Number of simulations: 1000
## Statistic: 21755.29
## p-value : 0.3976024
##############################
# Potthoff-Whittinghill test #
##############################
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000)
## Potthoff-Whittinghill's test of overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 6032784494
## p-value : 0.000999001
pottwhitt.test(Observed~offset(log(Expected)), data = dismap, model = "multinom", R=1000)
## Potthoff-Whittinghill's test of overdispersion
##
## Type of boots.: parametric
## Model used when sampling: Multinomial
## Number of simulations: 1000
## Statistic: 6032784494
## p-value : 0.000999001
###############
# Moran's I #
###############
nb = poly2nb (dat.areal_proj)
col.W = nb2listw(nb)
moranI.test (Observed~offset(log(Expected)), data = dismap, model = "poisson", R=1000, listw = col.W, n = nrow(dismap), S0 = Szero(col.W))
## Moran's I test of spatial autocorrelation
##
## Type of boots.: parametric
## Model used when sampling: Poisson
## Number of simulations: 1000
## Statistic: 0.3525243
## p-value : 0.000999001